#' ---
#' title: "Open Question"
#' author: "Will Lowe, Lukas F. Stoetzer"
#' date: "Feb 2020"
#' ---



# Packages ----
# The script uses functions from these packages but never attached them:
#   quanteda : tokens(), tokens_remove(), stopwords(), dfm(), dictionary(), ...
#   dplyr    : %>%, bind_rows(), select(), mutate(), summarize(), ...
#   tidyr    : pivot_longer(), extract()
#   readr    : write_rds(), read_csv()
#   ggplot2  : figures
library(quanteda)
library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)

# Read data ----
datw <- readRDS("dat_wide.RDS")

# Split the open-ended answers (cjopenpref) by questionnaire language ----
de <- datw$cjopenpref[datw$Q_Language == "DE"]
en <- datw$cjopenpref[datw$Q_Language == "EN"]
it <- datw$cjopenpref[datw$Q_Language == "IT"]
pl <- datw$cjopenpref[datw$Q_Language == "PL"]
# Holds the PT-BR answers; the name `pl_br` is kept because later code in
# this file refers to it.
pl_br <- datw$cjopenpref[datw$Q_Language == "PT-BR"]



# German answers: quick-and-dirty token counts ----
# Tokenise without punctuation, symbols and numbers ...
toks <- tokens(
  de,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
)
# ... drop German stopwords ...
toks <- tokens_remove(toks, stopwords("de"))
# ... and map tokens matching "überleb*" onto the single form
# "überlebenschance" (presumably a glob under tokens_replace()'s default
# valuetype -- confirm against the installed quanteda version).
toks <- tokens_replace(toks, "überleb*", "überlebenschance")

# Plain document-feature matrix of the cleaned German tokens.
dfm_de <- dfm(toks)
  
# German coding dictionary: a named list with one character vector of terms
# per category. It is turned into a quanteda dictionary and looked up in the
# German tokens further below; the resulting columns get the prefix "OQ_".
# The terms include misspellings and fused tokens (e.g. "aklter",
# "kinder7überlebungschance"), apparently copied verbatim from the answers --
# do not "correct" them.
# Some terms are listed under two categories and therefore count for both:
# "alter.allgemeinzustand" (age, health), "alter.kinder" (age, children),
# "kinder7überlebungschance" (survival_chance, children),
# "krankheit" (health, prev_condition).
# Unlike the English dictionary in this file, there are no `tax` / `rights`
# categories here.
de_dict <- list(
 age = c("alter","jüngere","jahre","älteren","jüngeren","age","jung","jünger",
         "jährigen","alte","70jähriger","30jähriger","40zig","jährige",
         "alter.allgemeinzustand","lebensalter","junger","jüngerer","aklter",
         "40jährigen","80jährigen","jpngeren","jüngsten","der80jährige","40jährige",
         "jüngeres","jahren","alter.kinder","geburtsjahrgang","60-jährigen","50-jährigen",
         "jähriger","junges","jüngste","altershöhe","jungem","40jähriger","80jähriger",
         "alters"),
 remaining_life = c("lebenserwartung","verbleibende","lebenszeit","potenziell","verfassung",
                    "restlebenszeit"),
 # "überlebenschance" is also the replacement target of the "überleb*"
 # token replacement applied to the German tokens earlier in this file.
 survival_chance = c("überlebenschance","widerstandsfähigkeit","ueberlebenschance","lebenschance",
                     "überebenswahrscheinlichkeit","chance","wahrscheinlichkeit","risiko",
                     "schanse","prozentuale", "möglichkeit","kinder7überlebungschance",
                     "chansengleichheit","genesungsvorhersage","heilungschance","ueberleben",
                     "dringlichkeitsprinzip","prognose","überlenschancen","uberlebenschance",
                     "überlenswahrscheinlichkeit","heilungscanche","chancen.kann",
                     "ueberlebenchance","erfolgsaussicht","selbstheilungschancen",
                     "wahrscheinlichkeiten","lebenschanc","űberlebenschance","risikochancen",
                     "genesungschancen","oberlebenschance","genesungschance","lebenschangse",
                     "aussichtslos","überlerlebenscance","cance","überlbenschance",
                     "genesungsprognose","gesundheitsvorgeschichte"
                     ),
 race = c("weiß","ethnische","herkunft","deutschland","🇩🇪"),
 # The same party term list is repeated in the "focus on party" section of
 # this file; keep the two in sync.
 party = c("partei","afd","parteizugehörigtkeit","nazis", "faschisten","parteineigung",
           "parteizugehörigkeit","parteibonzen", "aufpartei","patei","demos",
           "parteienneigung","parteikram","politischen","parteimitglied","parteifreund",
           "rechts-partei","politische"),
 children = c("kinder" ,"minderjährige","kind","kinder7überlebungschance","kindern",
              "alter.kinder","kinderanzahl-"),
 family = c("abhängige","lebenspartner","familie","familienstand","familienangehörige",
                "unabhängig","verheiratet","familienverhältnisse","angehörige",
            "familien","familen","schwangere","elternteil","mutter","ehepartner",
            "angehörig","hilflosigkeit","familiären","familienstatus","abhängigkeit"),
 sex = c("frauen","sex","weiblich"),
 education = c("ausbildung"),
 poverty = c("armen","ärmeren"),
 # "rolle" is listed twice (harmless duplicate).
 # NOTE(review): "bildung" is coded as job rather than education -- confirm.
 job = c("wichtigen","berufen","beruf","job","rolle","rolle","profession","berufstätig",
         "hartz","faul","einbezahlt","berufsleben","bildung","ärztin","jop","berufstätigkeit"
         ),
 system_relevance = c("wichtigkeit" ,"gesellschaft","systemrelevanter","system",
                      "pfleger","krankenhauspersonal","systemrelevant","systemrelevanz",
                      "systemrelevante","berufsgruppe","berufsbranche","gesellschaftsrelevant"
                      ),
 nationality = c("staatsangehörigkeit","deutsche"),
 # NOTE(review): "ehegatten" (spouses) is coded as health -- confirm this is
 # intended and not a misfiled family term.
 health = c("gesundheitszustand","arbeitsfähigkeit","gesundheitliche",
            "gesundheitsstand", "gesundheitsrisiken","raucher","alkoholiker",
            "ausgangssituation","alter.allgemeinzustand","fähigkeit","gesundheit",
            "gesund","gesundheitszustanf","stand","sportliche","ges.heitszustand",
            "kränkere","fieber","durchfall","atem","not","krankheit","lebensgefahr",
            "kontiution","gesundheitswesen","beatmen","körperlich","vtalität","vitalzustand",
            "akutesten","lebenswille","lebensqualität","allgem.gesundheitszustand",
            "vorerkannung","gesundelebensweise","ehegatten","gesundung"),
 arrival_order = c("ankunft","zuerst","kam","einlieferung","einlieferungsstatus","angekommen",
                   "erstens","zweitens","eingeliefert","zweiter","gleichheitsprinzip",
                   "aufnahmezeitpunkt","verbindung","transport"),
 # NOTE(review): "zugangsreihenfolge" (order of admission) is coded as
 # prev_condition rather than arrival_order -- confirm.
 prev_condition = c("vorerkrankungen","körperliche","erkrankung","risikogruppe","vorerkrankung",
                    "risikopatient","konstitution", "vorerkankungen","riikopatient",
                    "akkut-zustand","krankheit","erkrankungsbild","vorerkrankunkg",
                    "risikogebiete","körperlicher","vorerkrakungen","herzkranke",
                    "erkrankungen","krebs","endstadium","immunsystem","körper",
                    "krankheitsverlauf","zugangsreihenfolge","vorerkankung",
                    "risikofaktoren","fettleibigkeit","herzerkrankung")
)

# Code the German answers with the dictionary ----
# tokens_lookup() maps each matching token to its category key; dfm() then
# counts keys per answer. This replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
dfm_de_dict <- dfm(tokens_lookup(toks, dictionary = dictionary(de_dict)))

# Boolean weighting turns the counts into 0/1 "category mentioned" flags.
dfm_de_anycat <- convert(dfm_weight(dfm_de_dict, scheme = "boolean"),
                         to = "data.frame")
rownames(dfm_de_anycat) <- dfm_de_anycat$doc_id
dfm_de_anycat$doc_id <- NULL
names(dfm_de_anycat) <- paste0("OQ_", names(dfm_de_anycat))

# Indicators plus respondent id. psid is taken with the same language mask
# that produced `de`, so rows and ids line up by position.
de_oq_dict <- dfm_de_anycat
de_oq_dict$psid <- datw$psid[datw$Q_Language == "DE"]

# Diagnostics (not used again in this file):
# how many answers matched at least one category ...
tma <- table(`At least one match?` = rowSums(dfm_de_anycat) != 0)
# ... and how many answers have no tokens left after cleaning.
# vapply() instead of sapply() so the result is always a logical vector.
tno <- table(vapply(toks, function(x) length(x) == 0, logical(1)))


# English answers ----

# Tokenise without punctuation, symbols and numbers, then drop English
# stopwords. `toks` is overwritten here (it held the German tokens before).
toks <- tokens(
  en,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE
)
toks <- tokens_remove(toks, stopwords("en"))

# Plain document-feature matrix of the cleaned English tokens.
dfm_en <- dfm(toks)

# English coding dictionary: a named list with one character vector of terms
# per category. Besides the English answers it is also applied, further below
# in this file, to the English translations of the Italian, Polish and
# Brazilian-Portuguese answers; terms added for the Italian translations are
# marked inline. Misspellings (e.g. "survial", "healty") appear to be copied
# verbatim from the answers -- do not "correct" them.
# Some terms are listed under two categories and therefore count for both:
# "years" (age, remaining_life), "dad" (children, family).
# "already lived" and "physical condition" are multi-word entries.
# `tax` and `rights` have no counterpart in the German dictionary.
en_dict <- list(
  # "old" is listed twice (harmless duplicate).
  age = c("age","younger","older","children","young","yr","old","youngest",
          "minors","40-50","elderly","years", "old",
          # italian translation additions
          "age-absence","40-year-old", "senior", "youth"),
  remaining_life = c("life-span","years","already lived"
                     # italian translation additions
                     ),
  survival_chance = c("survival","recover","chances","percentages","survial","survive",
                      "potential", "sickest","surviral","survivial","recovery",
                      "probability","susurviving","40percent","surivial","surival",
                      "servival","surviveability","survivability","expectancy",
                      "survivile","success",
                      # italian translation additions
                      "doomed","pissibilita", "prognosis", "chance", "percent", "likely"),
  # NOTE(review): "nationality" is coded as race here while "black" is coded
  # as nationality below -- confirm these two are not swapped.
  race = c("white","nationality",
           # italian translation additions
           "immigrant", "migrants"),
  # The same party term list is repeated in the "focus on party" section of
  # this file; keep the two in sync.
  # NOTE(review): "citizenship" and "constitution" are coded as party --
  # confirm.
  party = c("political","party","affiliation","politics","survivalists","parties",
            "republican","demorate","democratic",
            # italian translation additions
            "membership", "citizenship", "constitution", "leghista", "non-partisanship",
            "fascism"),
  children = c("kids","child","dad","child's","parental",
               # italian translation additions
               "children's", "parents","daughter", "son", "sons", "pregnant",
               "offspring"),
  family = c("parent","father","family","dependents","dad","married",
             # italian translation additions
             "single", "mother", "childbearing"),
  sex = c("male","man","gender","men","sex","female","guy","females",
          # italian translation additions
          "women", "woman"),
  education = c("intellectuals",
                # italian translation additions
                "education"),
  poverty = c("low","paid"
              # italian translation additions
              ),
  tax = c("tax", "pay"), # italian only!
  rights = c("rights", "duties", "law"), # italian only
  job = c("nurses","profession","occupation","nurse","doctor","caregiver","boss",
          "physicians","employment","cooks","responders",
          "unemployed","professions", 
          # italian translation additions
          "nursing","professional","pédagogique","doctors"),
  system_relevance = c("contribution","society","economy","emt","police","officer",
                       "socitey","mankind","cook","societal","essential","caretaker"
                       # italian translation additions
                       ),
  nationality = c("citizen","black","country",
                  # italian translation additions
                  "italy", "italian"),
  health = c("style","hardiness","strength","healty","fighter","breathe",
             "lifestyle","muscular", "thin","physical condition","health",
             # italian translation additions
             "pathologies", "seriousness","drug","smoking", "healthier",
             "comorbidity", "fitness", "obesity", "lung", "cardiorespiratory"),
  arrival_order = c("arrival","order","first","1st","first-come-first","arrived",
                    # italian translation additions
                    "eta", "arrives", "precendence","precedence.everyone"),
  prev_condition = c("pre","existing","conditions","risk","factors","medical",
                     "history","chest","pain","fever","underlying",
                     "diabetes","heart","preexisting","pre-existing","comorbidities",
                     "pre-exsisting","respiratory","needing","immune","smoked",
                     "disabled","amputee","issues","status",
                     # italian translation additions
                     "gravity", "illnesses", "pneumonia")
)



# Code the English answers with the dictionary ----
# tokens_lookup() maps each matching token (or token sequence) to its
# category key; dfm() then counts keys per answer. This replaces
# dfm(toks, dictionary = ...), whose `dictionary` argument is deprecated in
# recent quanteda releases.
dfm_en_dict <- dfm(tokens_lookup(toks, dictionary = dictionary(en_dict)))

# Boolean weighting turns the counts into 0/1 "category mentioned" flags.
dfm_en_anycat <- convert(dfm_weight(dfm_en_dict, scheme = "boolean"),
                         to = "data.frame")
rownames(dfm_en_anycat) <- dfm_en_anycat$doc_id
dfm_en_anycat$doc_id <- NULL

names(dfm_en_anycat) <- paste0("OQ_", names(dfm_en_anycat))

# Indicators plus respondent id. psid is taken with the same language mask
# that produced `en`, so rows and ids line up by position.
en_oq_dict <- dfm_en_anycat
en_oq_dict$psid <- datw$psid[datw$Q_Language == "EN"]
write_rds(en_oq_dict, file = "dat_en_oq_dict.RDS")



########################## Italian

# The Italian answers are coded from their English translation with the
# English dictionary. The translation file is assumed to hold one answer per
# line, in the same order as `it`.
ll <- readLines("dat_it_txts_english-trans.txt")

# psid is attached by position below, so the file must have exactly one line
# per Italian respondent. Without this check `$<-` would silently recycle the
# ids whenever the number of rows is a multiple of the number of ids.
if (length(ll) != length(it)) {
  stop("dat_it_txts_english-trans.txt has ", length(ll),
       " lines but there are ", length(it), " IT answers",
       call. = FALSE)
}

toks <- tokens(ll, remove_punct = TRUE,
               remove_symbols = TRUE,
               remove_numbers = TRUE) %>%
  tokens_remove(stopwords("en"))
dfm_it <- dfm(toks)

# tokens_lookup() + dfm() replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
dfm_it_dict <- dfm(tokens_lookup(toks, dictionary = dictionary(en_dict)))
# Boolean weighting: 0/1 "category mentioned" flags per answer.
dfm_it_anycat <- convert(dfm_weight(dfm_it_dict, scheme = "boolean"),
                         to = "data.frame")
rownames(dfm_it_anycat) <- dfm_it_anycat$doc_id
dfm_it_anycat$doc_id <- NULL

names(dfm_it_anycat) <- paste0("OQ_", names(dfm_it_anycat))

it_oq_dict <- dfm_it_anycat
it_oq_dict$psid <- datw$psid[datw$Q_Language == "IT"]
write_rds(it_oq_dict, file = "dat_it_oq_dict.RDS")

####### polish

# The Polish answers are coded from their English translation with the
# English dictionary. The translation file is assumed to hold one answer per
# line, in the same order as `pl`.
ll <- readLines("dat_pl-txts-translated.txt")

# psid is attached by position below, so the file must have exactly one line
# per Polish respondent. Without this check `$<-` would silently recycle the
# ids whenever the number of rows is a multiple of the number of ids.
if (length(ll) != length(pl)) {
  stop("dat_pl-txts-translated.txt has ", length(ll),
       " lines but there are ", length(pl), " PL answers",
       call. = FALSE)
}

toks <- tokens(ll, remove_punct = TRUE,
               remove_symbols = TRUE,
               remove_numbers = TRUE) %>%
  tokens_remove(stopwords("en"))
# Was assigned to `dfm_br` (copy-paste slip); this is the Polish dfm.
dfm_pl <- dfm(toks)

# tokens_lookup() + dfm() replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
dfm_pl_dict <- dfm(tokens_lookup(toks, dictionary = dictionary(en_dict)))
# Boolean weighting: 0/1 "category mentioned" flags per answer.
dfm_pl_anycat <- convert(dfm_weight(dfm_pl_dict, scheme = "boolean"),
                         to = "data.frame")
rownames(dfm_pl_anycat) <- dfm_pl_anycat$doc_id
dfm_pl_anycat$doc_id <- NULL

names(dfm_pl_anycat) <- paste0("OQ_", names(dfm_pl_anycat))

pl_oq_dict <- dfm_pl_anycat
pl_oq_dict$psid <- datw$psid[datw$Q_Language == "PL"]
# NOTE(review): this file name has an extra "out_" compared with the other
# languages' output files; kept as is because other scripts may read it.
write_rds(pl_oq_dict, file = "dat_out_pl_oq_dict.RDS")

########### BR Portuguese

# The Brazilian-Portuguese answers are coded from their English translation
# with the English dictionary. The translation file is assumed to hold one
# answer per line, in the same order as `pl_br`.
ll <- readLines("dat_br_txts-translation.txt")

# psid is attached by position below, so the file must have exactly one line
# per PT-BR respondent. Without this check `$<-` would silently recycle the
# ids whenever the number of rows is a multiple of the number of ids.
if (length(ll) != length(pl_br)) {
  stop("dat_br_txts-translation.txt has ", length(ll),
       " lines but there are ", length(pl_br), " PT-BR answers",
       call. = FALSE)
}

toks <- tokens(ll, remove_punct = TRUE,
               remove_symbols = TRUE,
               remove_numbers = TRUE) %>%
  tokens_remove(stopwords("en"))
dfm_br <- dfm(toks)

# tokens_lookup() + dfm() replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
dfm_br_dict <- dfm(tokens_lookup(toks, dictionary = dictionary(en_dict)))
# Boolean weighting: 0/1 "category mentioned" flags per answer.
dfm_br_anycat <- convert(dfm_weight(dfm_br_dict, scheme = "boolean"),
                         to = "data.frame")
rownames(dfm_br_anycat) <- dfm_br_anycat$doc_id
dfm_br_anycat$doc_id <- NULL

names(dfm_br_anycat) <- paste0("OQ_", names(dfm_br_anycat))

pt_br_oq_dict <- dfm_br_anycat
pt_br_oq_dict$psid <- datw$psid[datw$Q_Language == "PT-BR"]
write_rds(pt_br_oq_dict, file = "dat_pt_br_oq_dict.RDS")



##################################### focus on party

# Tokens for the party look-ups, with default tokens() settings (unlike the
# counting sections above, nothing is removed). German and English use the
# original answers; Italian, Polish and Brazilian Portuguese use the English
# translation files.
toks_en <- tokens(en)
toks_de <- tokens(de)
toks_it <- tokens(readLines("dat_it_txts_english-trans.txt"))
toks_pl <- tokens(readLines("dat_pl-txts-translated.txt"))
toks_br <- tokens(readLines("dat_br_txts-translation.txt"))

#### super qualitative

# German party terms: reuse the `party` entry of de_dict instead of keeping a
# second, hand-copied list that can drift out of sync (the two lists were
# identical).
de_lst <- dictionary(list(p = de_dict[["party"]]))

# Positions of the German answers containing at least one party term.
# tokens_lookup() + dfm() replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
matches <- which(
  convert(dfm(tokens_lookup(toks_de, dictionary = de_lst)),
          to = "data.frame")$p > 0
)
# index counts from inside the language block
dd_de <- data.frame(index_within_lang = matches,
                    answer = de[matches],
                    stringsAsFactors = FALSE)


# english language versions
# English party terms: reuse the `party` entry of en_dict instead of keeping
# a second, hand-copied list that can drift out of sync (the two lists were
# identical). `lst` is also used for the translated IT / PL / PT-BR answers
# below.
lst <- dictionary(list(p = en_dict[["party"]]))

# Positions of the English answers containing at least one party term.
# tokens_lookup() + dfm() replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
matches <- which(
  convert(dfm(tokens_lookup(toks_en, dictionary = lst)),
          to = "data.frame")$p > 0
)
# index counts from inside the language block
dd_en <- data.frame(index_within_lang = matches,
                    answer = en[matches],
                    stringsAsFactors = FALSE)

## italian
# Positions of the Italian answers whose English translation contains at
# least one party term.
# tokens_lookup() + dfm() replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
matches <- which(
  convert(dfm(tokens_lookup(toks_it, dictionary = lst)),
          to = "data.frame")$p > 0
)
# index counts from inside the language block

ll <- readLines("dat_it_txts_english-trans.txt")
dd_it <- data.frame(index_within_lang = matches,
                    answer = it[matches],  # original answer
                    transl = ll[matches],  # line from the translation file
                    stringsAsFactors = FALSE)

## polish
# Positions of the Polish answers whose English translation contains at
# least one party term.
# tokens_lookup() + dfm() replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
matches <- which(
  convert(dfm(tokens_lookup(toks_pl, dictionary = lst)),
          to = "data.frame")$p > 0
)
# index counts from inside the language block

ll <- readLines("dat_pl-txts-translated.txt")
dd_pl <- data.frame(index_within_lang = matches,
                    answer = pl[matches],  # original answer
                    transl = ll[matches],  # line from the translation file
                    stringsAsFactors = FALSE)

## brazil
# Positions of the PT-BR answers whose English translation contains at least
# one party term.
# tokens_lookup() + dfm() replaces dfm(toks, dictionary = ...), whose
# `dictionary` argument is deprecated in recent quanteda releases.
matches <- which(
  convert(dfm(tokens_lookup(toks_br, dictionary = lst)),
          to = "data.frame")$p > 0
)
# index counts from inside the language block

ll <- readLines("dat_br_txts-translation.txt")
dd_br <- data.frame(index_within_lang = matches,
                    answer = pl_br[matches],  # original answer
                    transl = ll[matches],     # line from the translation file
                    stringsAsFactors = FALSE)

###### altogether now

# Number of answers per language, used to turn within-language positions into
# positions in the concatenation DE, EN, IT, PL, PT-BR.
n_de <- length(de)
n_en <- length(en)
n_it <- length(it)
n_pl <- length(pl)

# Offset of each language block = number of answers in the blocks before it.
offsets <- list(
  DE = 0,
  EN = n_de,
  IT = n_de + n_en,
  PL = n_de + n_en + n_it,
  `PT-BR` = n_de + n_en + n_it + n_pl
)

# Stack the per-language party matches; `country` holds the language code.
party_matches <- list(
  DE = dd_de,
  EN = dd_en,
  IT = dd_it,
  PL = dd_pl,
  `PT-BR` = dd_br
)
res <- bind_rows(party_matches, .id = "country")

# Overall index = block offset + position inside the block.
res$index <- unlist(offsets[res$country]) + res$index_within_lang

# The hand-coded version of these answers lives in a separate file
# (exported from Excel to CSV).


## considering the rest

# Per language: number of answers flagged for each category (column sums of
# the 0/1 OQ_ indicators).
counts_de <- colSums(select(de_oq_dict, starts_with("OQ")))
counts_en <- colSums(select(en_oq_dict, starts_with("OQ")))
counts_it <- colSums(select(it_oq_dict, starts_with("OQ")))
counts_pl <- colSums(select(pl_oq_dict, starts_with("OQ")))
counts_br <- colSums(select(pt_br_oq_dict, starts_with("OQ")))

# One row per country x category. Categories a language's dictionary does
# not have come out as NA after binding and are set to 0.
all_cats <- bind_rows(
  DE = counts_de,
  EN = counts_en,
  IT = counts_it,
  PL = counts_pl,
  `PT-BR` = counts_br,
  .id = "country"
)
all_cats <- pivot_longer(all_cats, starts_with("OQ"),
                         names_to = "category",
                         values_to = "count")
all_cats <- extract(all_cats, category, into = "category", regex = "OQ_(.*)")
all_cats <- mutate(all_cats, count = replace_na(count, 0))

# n is the total number of category flags in a country (summed over all
# categories), so `percent` is a category's share of all flags in that
# country, not its share of answers.
totals <- all_cats %>%
  group_by(country) %>%
  summarize(n = sum(count))
all_cats <- all_cats %>%
  left_join(totals, by = "country") %>%
  mutate(percent = count / n * 100)

# Category order for plotting: ascending by the sum of `percent` over
# countries.
fac_order <- all_cats %>%
  group_by(category) %>%
  summarize(n = sum(percent)) %>%
  arrange(n) %>%
  pull(category)

# Language code -> country label, in the order the panels should appear.
country_labels <- c(
  DE = "Germany",
  EN = "United States",
  IT = "Italy",
  PL = "Poland",
  `PT-BR` = "Brazil"
)

# Relabel countries and fix the factor level order of both plot variables.
all_cats <- mutate(
  all_cats,
  country = factor(unname(country_labels[as.character(country)]),
                   levels = unname(country_labels)),
  category = factor(category, levels = fac_order)
)

# Overlay data for highlighting the "party" bar in every country panel.
# `percent` is taken in the row order of all_cats, which is assumed to be
# DE, EN, IT, PL, PT-BR (the order in which all_cats was bound together).
# NOTE(review): `country` is a plain character vector here while
# all_cats$country is a factor -- confirm the panel order in the figure is
# the intended one.
is_party <- all_cats$category == "party"
annot <- data.frame(
  category = factor(rep("party", 5), levels = fac_order),
  percent = all_cats$percent[is_party],
  country = unname(country_labels)
)

######## Figure overall among categories

# One panel per country, horizontal bars per category; the "party" bar is
# drawn a second time in red from `annot`.
# NOTE(review): `percent` is count / (sum of counts over categories) within
# a country -- confirm that the axis label "Percent of responses containing
# category" describes this quantity.
p <- ggplot(all_cats, aes(category, percent)) +
  geom_col() +
  labs(y = "Percent of responses containing category",
       x = "Category") +
  # The highlight layer inherits aes(category, percent) from ggplot().
  # The former `aes = aes(country, percent)` was not a geom_col() argument:
  # it was ignored with an "unknown parameters" warning, and mapping x to
  # country would have been wrong on a category axis anyway.
  geom_col(data = annot, fill = "red") +
  facet_wrap(facets = vars(country), nrow = 3) +
  coord_flip()

ggsave("appendix_figure19.pdf", p,
       width = 5, height = 7.5, units = "in")

# Headline counts per questionnaire language (not used again in this file):
# all respondents ...
resp <- with(datw, table(Q_Language, dnn = "respondents"))
# ... respondents whose open answer is not the empty string ...
ans <- with(datw, table(Q_Language[cjopenpref != ""], dnn = "answerers"))
# ... and answers that matched a party term (rows of `res`).
part <- table(res[["country"]], dnn = "party_mentioners")


# Hand-coded party relevance, summarised per country.
#   n                  non-missing codes over both coding columns
#   party_important    sum of the party_important column
#   party_unimportant  sum of the party_unimportant column
#   important / unimportant  the two sums as a percentage of n
coded_country_labels <- c(
  DE = "Germany",
  EN = "United States",
  IT = "Italy",
  PL = "Poland",
  `PT-BR` = "Brazil"
)

breakdown <- read_csv("dat_coded_party_relevance.csv") %>%
  group_by(country) %>%
  summarize(
    n = sum(!is.na(c(party_important, party_unimportant))),
    # From here on the two names refer to the per-country sums just created.
    party_important = sum(party_important, na.rm = TRUE),
    party_unimportant = sum(party_unimportant, na.rm = TRUE),
    important = 100 * party_important / n,
    unimportant = 100 * party_unimportant / n
  ) %>%
  mutate(
    country = factor(
      unname(coded_country_labels[as.character(country)]),
      levels = unname(coded_country_labels)
    )
  )

# Long format of the percentages (bar heights): country, Preference, Percent.
# Columns are picked by name; these are the former positions 1, 5 and 6.
breakdown_long_p <- breakdown %>%
  select(country, important, unimportant) %>%
  pivot_longer(c(important, unimportant),
               names_to = "Preference",
               values_to = "Percent")

# Long format of the raw counts (bar labels); former positions 1, 3 and 4.
# The "party_" prefix is stripped so Preference matches the table above, and
# Percent is set to a fixed label position: 90 for "important", 10 otherwise.
breakdown_long_c <- breakdown %>%
  select(country, party_important, party_unimportant) %>%
  pivot_longer(c(party_important, party_unimportant),
               names_to = "Preference",
               values_to = "Count") %>%
  extract(Preference, into = "Preference", regex = "party_(.*)") %>%
  mutate(Percent = ifelse(Preference == "important", 90, 10))

######## Figure among those who think party is relevant

# Horizontal stacked bars per country (percent important / unimportant),
# with the raw counts printed at the fixed positions stored in
# breakdown_long_c$Percent.
p <- ggplot(breakdown_long_p, aes(country, Percent, fill = Preference)) +
  geom_col(color = NA) +
  geom_text(data = breakdown_long_c, aes(country, Percent, label = Count),
            size = 3) +
  # `values` is a named character vector (it was a list): names are matched
  # against the levels of Preference.
  scale_fill_manual(values = c(important = "darkgray",
                               unimportant = "lightgray")) +
  labs(x = "Country", fill = "Party is...") +
  coord_flip()

ggsave("appendix_figure20.pdf", p, width = 5.5, height = 2)


